{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split \n",
    "from sklearn.model_selection import KFold\n",
    "import numpy as np\n",
    "import time\n",
    "import os\n",
    "\n",
    "# 计时器\n",
    "def timer (func):\n",
    "    def wrapper(*args,**kwargs): \n",
    "        start = time.time()\n",
    "        result = func(*args,**kwargs)\n",
    "        end = time.time()\n",
    "        print(func.__name__+'运行时间：',end-start)\n",
    "        return result\n",
    "    return wrapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "@timer\n",
    "def load_train():\n",
    "    # 导入数据\n",
    "    train_df = joblib.load('../semi_super/x_train.lz4')\n",
    "    y_train_df = train_df[['tag']]\n",
    "    x_train_df = train_df.drop(columns=['tag'])\n",
    "    x_train_df = x_train_df.fillna(-1)\n",
    "\n",
    "    x_train_arr = x_train_df.values\n",
    "    y_train_arr = y_train_df.values.ravel()\n",
    "    return x_train_arr,y_train_arr\n",
    "\n",
    "@timer\n",
    "def load_unlabel():\n",
    "    unlabel_df = joblib.load('../semi_super/x_unlabel.lz4')\n",
    "    y_unlabel_df = unlabel_df[['tag']].copy()\n",
    "    x_unlabel_df = unlabel_df.drop(columns=['tag'])\n",
    "    x_unlabel_df = x_unlabel_df.fillna(-1)\n",
    "    \n",
    "    y_unlabel_arr = y_unlabel_df.tag.values.ravel()\n",
    "    x_unlabel_arr = x_unlabel_df.values\n",
    "    return x_unlabel_arr,y_unlabel_arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "load_train运行时间： 7.500000953674316\n",
      "load_unlabel运行时间： 16.499998807907104\n"
     ]
    }
   ],
   "source": [
    "x_train_arr,y_train_arr = load_train()\n",
    "x_unlabel_arr,y_unlabel_arr = load_unlabel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def SelectModel(model_name):\n",
    "    if model_name == 'XGB':\n",
    "        from xgboost import XGBClassifier\n",
    "\n",
    "        model = XGBClassifier(max_depth=6,\n",
    "                              learning_rate =0.04, \n",
    "                              booster='gbtree',\n",
    "                              objective='binary:logistic',\n",
    "                              early_stopping_rounds=100,\n",
    "                              scale_pos_weight=13,\n",
    "                              eval_metric='auc',\n",
    "                              gamma=1,\n",
    "                              reg_lambda=1,\n",
    "                              subsample=0.9,\n",
    "                              min_child_weight=1,\n",
    "                              seed=2018,\n",
    "                              silent=False,\n",
    "                              n_jobs=24,\n",
    "                              num_boost_round =300\n",
    "                             )\n",
    "    elif model_name == 'RFC':\n",
    "        from sklearn.ensemble import RandomForestClassifier\n",
    "        model = RandomForestClassifier(n_estimators=2000,\n",
    "                                       n_jobs =36,\n",
    "                                       max_features='sqrt',\n",
    "                                       class_weight='balanced',\n",
    "#                                        verbose =1,\n",
    "                                       random_state=2018)\n",
    "    elif model_name == 'LGB':\n",
    "        from lightgbm import LGBMClassifier\n",
    "        model = LGBMClassifier(boost='gbdt',\n",
    "                    num_leaves=135, \n",
    "                    scale_pos_weight=13,\n",
    "                    max_depth=-1,\n",
    "                    learning_rate=.05,\n",
    "                    max_bin=200,\n",
    "                    min_data_in_leaf= 60,\n",
    "                    objective='binary',\n",
    "                    metric='auc',\n",
    "                    num_threads=32,\n",
    "                    slient=False,\n",
    "                    num_boost_round =300)\n",
    "    else:\n",
    "        pass\n",
    "    \n",
    "    return model\n",
    "\n",
    "def get_score(clf,n_folds,x_train,y_train,x_base,y_base):\n",
    "    n_train = x_train.shape[0]\n",
    "    kf = KFold(n_splits=n_folds,random_state=2018)\n",
    "    score = np.zeros((n_train,))\n",
    "    \n",
    "    for i,(train_index,test_index)in enumerate(kf.split(x_train,y_train)):\n",
    "        kf_x_train = np.vstack((x_base,x_train[train_index]))\n",
    "        kf_y_train = np.hstack((y_base,y_train[train_index]))\n",
    "        kf_x_test = x_train[test_index]\n",
    "        \n",
    "        clf.fit(kf_x_train,kf_y_train)\n",
    "        tmp_trafold = clf.predict_proba(kf_x_test)[:,1].ravel()\n",
    "        score[test_index] = tmp_trafold\n",
    "     \n",
    "    return score\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# 第一级\n",
    "model_list = ['RFC','XGB','LGB']\n",
    "clf = SelectModel(model_list[0])\n",
    "score = get_score(clf,5,x_unlabel_arr,y_unlabel_arr,x_train_arr,y_train_arr)\n",
    "joblib.dump(score,'./unlabel_preds/{}.joblib'.format(model_list[0])) \n",
    "for model in model_list[1:]:\n",
    "    clf = SelectModel(model)\n",
    "    tmp = get_score(clf,5,x_unlabel_arr,y_unlabel_arr,x_train_arr,y_train_arr)\n",
    "    joblib.dump(tmp,'./unlabel_preds/{}.joblib'.format(model)) \n",
    "    score += tmp\n",
    "score = score/3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "unlabel_id = pd.read_csv('../../preprocess_data_new/train_ax_nodup.csv',usecols=[0],skiprows=33465).values.ravel()\n",
    "unlabel_tag_score = pd.DataFrame()\n",
    "unlabel_tag_score['id'] = unlabel_id\n",
    "unlabel_tag_score['score'] = score\n",
    "unlabel_tag_score.to_csv('./unlabel_preds/unlabel_tag_score.csv',index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
